This is a report of the modular analysis on the EKD11 transcriptomics dataset/
I have used the Singhania/O’Garra blood modules, using Qusage to quantify the relative encrihment of the modules.
Qusage will assess the mean expression of all genes in a geneset (module) within a treatment group and compare against a control group, and output the log2Fold Enrichment of for each module.
This will load the necessarys packages for the analysis, as well as load the module data.
This analysis requires us to have loaded the samples into DESeq2 to identify the genes found to be differentially expressed (DE) in each comparison, as well as for a log transformed normalised count matrix (normalised to library size).
rm(list = ls())
# Libraries
library(DESeq2)
library(ComplexHeatmap)
library(tidyverse)
library(qusage)
library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(scales)
nf.dir <- "/Users/alderc/1-projects/CAMP/1-AS_timecourse/2-EKD11/1-Pipeline/1-Nextflow/"
work.dir <- "/Users/alderc/1-projects/1-PIR_project/3-EKD11_AS_Early_TP/"
r.dir <- paste(work.dir, "1-R/",sep='') #change once refactor is completed
tmp.dir <- paste(work.dir,"tmp/",sep='')
data.dir <- paste(work.dir,"4-data/",sep='')
results.dir <- paste(work.dir,"2-results_2020/",sep='')
genome.dir <- "/Users/alderc/1-projects/9-Data/1-Reference_genomes/1-Mus_musculus/"
module.anno <- read.csv(file=paste(data.dir, '/modular_analysis/Annotation of Blood modules.csv', sep=''),
header=TRUE,
sep=',');
module.dat <- read.csv(file=paste(data.dir, '/modular_analysis/Blood modules.csv', sep=''),
header=TRUE,
sep=',');
count.dat <- read.table(file= paste(results.dir, 'genes.vst_counts_with_controls.xls.gz', sep=''),
header=TRUE,
sep='\t');
# Load DESeq results list
load(paste(r.dir, "Projects/results.Rdata", sep = ""))
deg.list <- lapply(res.list, function(x){
x <- x[x$DEG, ]
})
#List of Module IDs
module.num <- module.anno$X
print(module.anno[, -2])
## X Number.of.genes Biological.process
## 1 B1 169 Ribosome biogenesis
## 2 B2 53 Ribosome functions
## 3 B3 82 Mitochondria/Oxidative phosphorylation
## 4 B4 732 Myeloid cells/Membrane functions
## 5 B5 144 Macrophages/Cytokine signaling
## 6 B6 196 Myeloid cells/Lipid metabolism
## 7 B7 349 Myeloid cells/Granulocytes
## 8 B8 381 Oxidative phosphorylation/Metabolism/ATP
## 9 B9 408 Ribosome/Mitochondrial activty
## 10 B10 395 Cell cycle/DNA replication
## 11 B11 134 IFN signaling/Il10
## 12 B12 267 Oxidative phosphorylation
## 13 B13 301 Stress response/Protein folding/Ubiquitination
## 14 B14 172 IFN signaling
## 15 B15 81 Cytotoxic/T cells/ILC/Tbx21/Eomes
## 16 B16 82 Granulocytes
## 17 B17 220 Myeloid cells/Granulocytes
## 18 B18 367 Myeloid cells/Granulocytes
## 19 B19 60 T cells/Leukocytes
## 20 B20 126 Cytoskeleton/Calcium signaling
## 21 B21 254 Drug metabolism/Protease inhibitors
## 22 B22 30 Metabolism
## 23 B23 134 Drug metabolism/Protease inhibitors
## 24 B24 102 Tissue factor signaling in cancer
## 25 B25 123 Miscellaneous enzymes
## 26 B26 236 Miscellaneous Adhesion/Signaling
## 27 B27 279 Epithelial cell function
## 28 B28 264 B cell signaling anddevelopment
## 29 B29 116 B cell signaling anddevelopment
## 30 B30 207 B cell/Myeloid signaling
## 31 B31 631 RNA and DNA processes
## 32 B32 557 Histone modification
## 33 B33 182 Hormone receptor signaling
## 34 B34 458 Autophagy/Erythrocyte function/Heme
## 35 B35 412 Integrins/Platelets/Histones
## 36 B36 209 Miscellaneous
## 37 B37 198 T cell differentiation
## 38 B38 252 B cells/T cells and signaling
## 39 B39 356 Leukocyte signaling
## 40 B40 95 Cytoskeleton/Actin-based motility
## 41 B41 154 T cells/T cell signaling
Transforming data to create Modules into Geneset format for Qusage
module.geneset <- lapply(module.num, function(mod){
dat <- module.dat[module.dat$Module == mod, 'X'];
dat <- dat[!is.na(dat)]
mod <- dat
})
names(module.geneset) <- paste(module.num, module.anno$Biological.process)
#Check to see if Merge was correct
for (mod in names(module.geneset)){
name <- str_split(mod, " ")[[1]]
anno <- as.numeric(module.anno[module.anno$X == name[1], 'Number.of.genes']);
if (length(module.geneset[[mod]]) == anno){
print(paste("Geneset", mod, 'contains correct number of genes:', anno, sep=' '))
} else {
print('Module gene number does not match')
}
}
## [1] "Geneset B1 Ribosome biogenesis contains correct number of genes: 169"
## [1] "Geneset B2 Ribosome functions contains correct number of genes: 53"
## [1] "Geneset B3 Mitochondria/Oxidative phosphorylation contains correct number of genes: 82"
## [1] "Geneset B4 Myeloid cells/Membrane functions contains correct number of genes: 732"
## [1] "Geneset B5 Macrophages/Cytokine signaling contains correct number of genes: 144"
## [1] "Geneset B6 Myeloid cells/Lipid metabolism contains correct number of genes: 196"
## [1] "Geneset B7 Myeloid cells/Granulocytes contains correct number of genes: 349"
## [1] "Geneset B8 Oxidative phosphorylation/Metabolism/ATP contains correct number of genes: 381"
## [1] "Geneset B9 Ribosome/Mitochondrial activty contains correct number of genes: 408"
## [1] "Geneset B10 Cell cycle/DNA replication contains correct number of genes: 395"
## [1] "Geneset B11 IFN signaling/Il10 contains correct number of genes: 134"
## [1] "Geneset B12 Oxidative phosphorylation contains correct number of genes: 267"
## [1] "Geneset B13 Stress response/Protein folding/Ubiquitination contains correct number of genes: 301"
## [1] "Geneset B14 IFN signaling contains correct number of genes: 172"
## [1] "Geneset B15 Cytotoxic/T cells/ILC/Tbx21/Eomes contains correct number of genes: 81"
## [1] "Geneset B16 Granulocytes contains correct number of genes: 82"
## [1] "Geneset B17 Myeloid cells/Granulocytes contains correct number of genes: 220"
## [1] "Geneset B18 Myeloid cells/Granulocytes contains correct number of genes: 367"
## [1] "Geneset B19 T cells/Leukocytes contains correct number of genes: 60"
## [1] "Geneset B20 Cytoskeleton/Calcium signaling contains correct number of genes: 126"
## [1] "Geneset B21 Drug metabolism/Protease inhibitors contains correct number of genes: 254"
## [1] "Geneset B22 Metabolism contains correct number of genes: 30"
## [1] "Geneset B23 Drug metabolism/Protease inhibitors contains correct number of genes: 134"
## [1] "Geneset B24 Tissue factor signaling in cancer contains correct number of genes: 102"
## [1] "Geneset B25 Miscellaneous enzymes contains correct number of genes: 123"
## [1] "Geneset B26 Miscellaneous Adhesion/Signaling contains correct number of genes: 236"
## [1] "Geneset B27 Epithelial cell function contains correct number of genes: 279"
## [1] "Geneset B28 B cell signaling anddevelopment contains correct number of genes: 264"
## [1] "Geneset B29 B cell signaling anddevelopment contains correct number of genes: 116"
## [1] "Geneset B30 B cell/Myeloid signaling contains correct number of genes: 207"
## [1] "Geneset B31 RNA and DNA processes contains correct number of genes: 631"
## [1] "Geneset B32 Histone modification contains correct number of genes: 557"
## [1] "Geneset B33 Hormone receptor signaling contains correct number of genes: 182"
## [1] "Geneset B34 Autophagy/Erythrocyte function/Heme contains correct number of genes: 458"
## [1] "Geneset B35 Integrins/Platelets/Histones contains correct number of genes: 412"
## [1] "Geneset B36 Miscellaneous contains correct number of genes: 209"
## [1] "Geneset B37 T cell differentiation contains correct number of genes: 198"
## [1] "Geneset B38 B cells/T cells and signaling contains correct number of genes: 252"
## [1] "Geneset B39 Leukocyte signaling contains correct number of genes: 356"
## [1] "Geneset B40 Cytoskeleton/Actin-based motility contains correct number of genes: 95"
## [1] "Geneset B41 T cells/T cell signaling contains correct number of genes: 154"
Now we format the rlog data to the format we need
## Trimming data
rownames(count.dat) <- count.dat$gene_id
sample.cols <- grep(pattern = '^naive|d.\\.', x = names(count.dat), value = TRUE)
count.mat <- count.dat[ , sample.cols];
For this analysis, I have compared each transmission group separately against the control (i.e MT vs Naive, SBP vs Naive and RTMT vs Naive).
Briefly, the script will compare the treatment and control on a individual timepoint (e.g. Day 1), to calculate the fold enrichment of the module, once each module enrichment score is complete it will move onto the next timepoint until all calculations are completed. Qusage will output a log fold enrichment score for each module, as well as p-value for the comparison. Modules with p-value > 0.05 were removed from further analysis.
Finally, the script will look at all the genes within a module, and look to see whether they were DE within the similar comparisons done within DESeq2, and return the proportion of genes within the module that were DE.
# Load Qusage reults (If run already)
if(file.exists(paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))){
load(paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))
}
mvn.cols <- grepl(pattern = "naive|^mt", x= names(count.mat));
mvn.mat <- count.mat[, mvn.cols];
if(!exists("results.mvn")){
results.mvn <- lapply(c(1,2,3,4,6), function(d){
cols <- grepl(pattern = paste("d", d ,"|naive", sep=""), x = names(mvn.mat))
dat <- mvn.mat[, cols]
print(names(dat))
labels <- sapply(names(dat), function(x){
ifelse(grepl(pattern = "mt", x = x), "mt","naive")
})
contrast = "mt-naive";
qusage(dat, labels, contrast, module.geneset, n.points = 2^16)
})
names(results.mvn) <- c("Day1", "Day2", "Day3", "Day4", "Day6")
}
qusage.mvn <- lapply(names(results.mvn), function(x){
df <- qsTable(results.mvn[[x]], number = 41)
df$log.fold.change <- ifelse(df$p.Value < 0.05, df$log.fold.change, NA)
df <- df[ ,c(1,2)]
colnames(df) <- c("Module", paste(x, "_FC", sep = ""))
df
})
qusage.mvn.all <- Reduce(function(x,y) merge(x = x, y = y, by = "Module", no.dups=T),
qusage.mvn)
qusage.mvn.all <- qusage.mvn.all[match(names(module.geneset), qusage.mvn.all$Module), ]
qusage.mvn.all$Module <- factor(qusage.mvn.all$Module, levels = qusage.mvn.all$Module)
qusage.mvn.melt <- melt(qusage.mvn.all, id.vars = "Module")
qusage.mvn.melt$prop <-apply(qusage.mvn.melt, 1, function(prop){
if (!is.na(prop[["value"]])){
mod <- prop[[1]]
day <- as.numeric(str_match(string = prop[2], pattern = "[0-9]"))
geneset <- module.geneset[[mod]]
num.genes <- length(geneset)
deg.name <- grep(pattern = paste('^mt\\.', day, '_vs_naive', sep='') , x = names(deg.list), value = TRUE);
deg <- deg.list[[deg.name]]
deg <- deg[geneset, ] %>% drop_na()
output <- nrow(deg) / num.genes} else {output <- NA}
output
})
x <- ggplot(qusage.mvn.melt, aes(x = variable, y = reorder(Module, desc(Module)))) +
ggtitle('Mosquito Transmission') + xlab('Day') + ylab('Module') +
scale_x_discrete(labels= c(1,2,3,4,6)) +
geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
labs(size = "Proportion of DEG in Module", colour = 'log2 Fold Change') +
scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
scale_size_area(max_size = 9) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.background = element_rect(fill = 'white'),
plot.background = element_blank(),
legend.background = element_rect(fill = 'transparent'))
x
Results of the modular analysis of Mosquito transmission vs Control. Analysis was performed on all timepoint:module combinations and comparisons with a p-value < 0.05 were removed. The colour of the bubble indicated the intensity of fold change with red showing an upregulation and blue showing a downregulation. Size of the bubble indicates how many genes within a module were DE during that timepoint:module combination.
svn.cols <- grepl(pattern = "naive|^sbp", x= names(count.mat));
svn.mat <- count.mat[, svn.cols];
if(!exists("results.svn")){
results.svn <- lapply(c(1,2,3,4,6), function(d){
cols <- grepl(pattern = paste("d", d ,"|naive", sep=""), x = names(svn.mat));
dat <- svn.mat[, cols]
print(names(dat))
labels <- sapply(names(dat), function(x){
ifelse(grepl(pattern = "sbp", x = x), "sbp","naive")
})
contrast = "sbp-naive";
qusage(dat, labels, contrast, module.geneset, n.points = 2^16);
})
names(results.svn) <- c("Day1", "Day2", "Day3", "Day4", "Day6")
}
qusage.svn <- lapply(names(results.svn), function(x){
df <- qsTable(results.svn[[x]], number = 41)
df$log.fold.change <- ifelse(df$p.Value < 0.05, df$log.fold.change, NA)
df <- df[ ,c(1,2)]
colnames(df) <- c("Module", paste(x, "_FC", sep = ""))
df
})
qusage.svn.all <- Reduce(function(x,y) merge(x = x, y = y, by = "Module", no.dups=T),
qusage.svn)
qusage.svn.all <- qusage.svn.all[match(names(module.geneset), qusage.svn.all$Module), ]
qusage.svn.all$Module <- factor(qusage.svn.all$Module, levels = qusage.svn.all$Module)
qusage.svn.melt <- melt(qusage.svn.all, id.vars = "Module")
qusage.svn.melt$prop <-apply(qusage.svn.melt, 1, function(prop){
if (!is.na(prop[["value"]])){
mod <- prop[[1]]
day <- as.numeric(str_match(string = prop[2], pattern = "[0-9]"))
geneset <- module.geneset[[mod]]
num.genes <- length(geneset)
deg.name <- grep(pattern = paste('^sbp\\.', day, '_vs_naive', sep='') , x = names(deg.list), value = TRUE);
deg <- deg.list[[deg.name]]
deg <- deg[geneset, ] %>% drop_na()
output <- nrow(deg) / num.genes} else {output <- NA}
output
})
x <- ggplot(qusage.svn.melt, aes(x = variable, y = reorder(Module, desc(Module)))) +
ggtitle('Serially Blood Passaged') + xlab('Day') + ylab('Module') +
scale_x_discrete(labels= c(1,2,3,4,6)) +
geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
scale_size_area(max_size = 9) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.background = element_rect(fill = 'white'),
plot.background = element_blank(),
legend.background = element_rect(fill = 'transparent'))
x
Results of the modular analysis of Serially Blood Passaged vs Control. Analysis was performed on all timepoint:module combinations and comparisons with a p-value < 0.05 were removed. The colour of the bubble indicated the intensity of fold change with red showing an upregulation and blue showing a downregulation. Size of the bubble indicates how many genes within a module were DE during that timepoint:module combination.
rvn.cols <- grepl(pattern = "naive|^rtmt", x= names(count.mat));
rvn.mat <- count.mat[, rvn.cols];
if(!exists("results.rvn")){
results.rvn <- lapply(c(1,2,3,4,6), function(d){
cols <- grepl(pattern = paste("d", d ,"|naive", sep=""), x = names(rvn.mat));
dat <- rvn.mat[, cols]
print(names(dat))
labels <- sapply(names(dat), function(x){
ifelse(grepl(pattern = "rtmt", x = x), "rtmt","naive")
})
print(labels)
contrast = "rtmt-naive";
qusage(dat, labels, contrast, module.geneset, n.points = 2^16);
})
names(results.rvn) <- c("Day1", "Day2", "Day3", "Day4", "Day6")
}
qusage.rvn <- lapply(names(results.rvn), function(x){
df <- qsTable(results.rvn[[x]], number = 41)
df$log.fold.change <- ifelse(df$p.Value < 0.05, df$log.fold.change, NA)
df <- df[ ,c(1,2)]
colnames(df) <- c("Module", paste(x, "_FC", sep = ""))
df
})
qusage.rvn.all <- Reduce(function(x,y) merge(x = x, y = y, by = "Module", no.dups=T),
qusage.rvn)
qusage.rvn.all <- qusage.rvn.all[match(names(module.geneset), qusage.rvn.all$Module), ]
qusage.rvn.all$Module <- factor(qusage.rvn.all$Module, levels = qusage.rvn.all$Module)
qusage.rvn.melt <- melt(qusage.rvn.all, id.vars = "Module")
qusage.rvn.melt$prop <-apply(qusage.rvn.melt, 1, function(prop){
if (!is.na(prop[["value"]])){
mod <- prop[[1]]
day <- as.numeric(str_match(string = prop[2], pattern = "[0-9]"))
geneset <- module.geneset[[mod]]
num.genes <- length(geneset)
deg.name <- grep(pattern = paste('^rtmt\\.', day, '_vs_naive', sep='') , x = names(deg.list), value = TRUE);
deg <- deg.list[[deg.name]]
deg <- deg[geneset, ] %>% drop_na()
output <- nrow(deg) / num.genes} else {output <- NA}
output
})
x <- ggplot(qusage.rvn.melt, aes(x = variable, y = reorder(Module, desc(Module)))) +
ggtitle('Recently Mosquito Transmitted') + xlab('Day') + ylab('Module') +
scale_x_discrete(labels= c(1,2,3,4,6)) +
geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
scale_size_area(max_size = 9) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.background = element_rect(fill = 'white'),
plot.background = element_blank(),
legend.background = element_rect(fill = 'transparent'))
x
Results of the modular analysis of Recently Mosquito transmission vs Control. Analysis was performed on all timepoint:module combinations and comparisons with a p-value < 0.05 were removed. The colour of the bubble indicated the intensity of fold change with red showing an upregulation and blue showing a downregulation. Size of the bubble indicates how many genes within a module were DE during that timepoint:module combination.
qusage.mvn.melt$transmission <- "MT"
qusage.rvn.melt$transmission <- "RTMT"
qusage.svn.melt$transmission <- "SBP"
qusage.all.melt <- do.call("rbind", list(qusage.mvn.melt, qusage.rvn.melt , qusage.svn.melt))
qusage.all.d3 <- qusage.all.melt[qusage.all.melt$variable == "Day3_FC", ]
x <- ggplot(qusage.all.d3, aes(x = transmission, y = reorder(Module, desc(Module)))) +
ggtitle('Day 3 - All Transmission') + xlab('Transmission') + ylab('Module') +
geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
scale_size_area(max_size = 9) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.background = element_rect(fill = 'white'),
plot.background = element_blank(),
legend.background = element_rect(fill = 'transparent'))
x
qusage.all.d4 <- qusage.all.melt[qusage.all.melt$variable == "Day4_FC", ]
x <- ggplot(qusage.all.d4, aes(x = transmission, y = reorder(Module, desc(Module)))) +
ggtitle('Day 4 - All Transmission') + xlab('Transmission') + ylab('Module') +
geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
scale_size_area(max_size = 9) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.background = element_rect(fill = 'white'),
plot.background = element_blank(),
legend.background = element_rect(fill = 'transparent'))
x
qusage.all.d6 <- qusage.all.melt[qusage.all.melt$variable == "Day6_FC", ]
x <- ggplot(qusage.all.d6, aes(x = transmission, y = reorder(Module, desc(Module)))) +
ggtitle('Day 6 - All Transmission') + xlab('Transmission') + ylab('Module') +
geom_count(aes(size = prop, colour = value), na.rm = TRUE) +
labs(size = "Proportion of DEG in Module", colour = 'Enrichment score') +
scale_color_gradientn(colours = rev(brewer.pal(11, "RdBu")),values = rescale(c(-0.5,0,1.25)), limits = c(-0.5,1.25)) +
scale_size_area(max_size = 9) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.background = element_rect(fill = 'white'),
plot.background = element_blank(),
legend.background = element_rect(fill = 'transparent'))
x
if(!file.exists(paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))){
save(results.mvn, results.svn, results.rvn, file = paste(r.dir, "Projects/qusage_results.Rdata", sep = ""))
}